# --- setup: data handling ---
import pandas as pd
import numpy as np
from collections import defaultdict
# --- models: text feature extraction and clustering (scikit-learn) ---
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
# --- nlp: tokenization and stop words (assumes the NLTK 'punkt' and 'stopwords' corpora are downloaded) ---
import nltk
from nltk.tokenize import word_tokenize # nltk tokenizer
nltk_stopwords = nltk.corpus.stopwords.words('english') # nltk stop words
from nltk.util import ngrams
# --- sys: silence warnings for notebook readability ---
import warnings
warnings.filterwarnings('ignore')
# --- visualization ---
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
Goal: cluster reviewers based on their review text, and use the insights from each cluster to understand how each cluster of customers perceives the price, quality, and value of each product.
# read the DataFrame pickled by the earlier preprocessing notebook
df = pd.read_pickle("df_all.pkl")
df.shape
df = df.dropna(how = 'all',subset = ["Sound Bite Text"]) # drop rows with review text missing
# drop rows with review text deleted
df.drop(df[(df["Sound Bite Text"] == "Post deleted by the author.")].index, inplace = True)
df.shape
# draw a 2000-row sample for the quick elbow-method experiment below.
# NOTE(review): despite the original "stratification" comment, resample()
# as called here is a plain random sample — pass stratify=<column>
# if stratified sampling on the product class is actually intended.
from sklearn.utils import resample
df_new = resample(df, n_samples = 2000, replace=False, random_state=0)
# df_stratified = df.loc[products_new.index]
len(df_new)
Hypothesis: from the previous EDA on titles we know that topics mostly revolve around the products, and there are three products: Samsung Galaxy, iPhone X, and iPhone 8. There are three dimensions reviewers care about: price, quality, and value. Based on these two assumptions, the number of clusters should be between 6 and 9.
# Elbow method: fit K-Means for k = 1..14 on the 2000-row sample and
# record the inertia (within-cluster sum of squared distances) for each k.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df_new["Sound Bite Text"])
sum_of_squared_distances = []
K = range(1, 15)
for k in K:
    # n_init=1 keeps this loop fast; increase it for a more stable curve
    km = KMeans(n_clusters=k, init='k-means++', max_iter=1000, n_init=1)
    km = km.fit(X)
    sum_of_squared_distances.append(km.inertia_)
# plot the curve; the "elbow" suggests the optimal k
plt.plot(K, sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum of squared distances')
plt.title('Elbow Method For Optimal k')
plt.show()
The plot confirms our previous assumption that the optimal k lies between 6 and 9, as the marginal decrease in the sum of squared distances slows down in that range. So let's set k = 6.
# pipeline equivalent of the TfidfVectorizer flow above:
# raw token counts -> tf-idf weighting -> K-Means with the chosen k = 6
text_cluster = Pipeline([('vect', CountVectorizer(stop_words = 'english')),
('tfidf', TfidfTransformer()),
('km', KMeans(n_clusters = 6, init='k-means++', max_iter=1000, n_init=1))])
text_cluster.fit(df_new["Sound Bite Text"])
# vectorize the FULL dataset (not the 2000-row sample) with tf-idf,
# removing English stop words
documents = df["Sound Bite Text"]
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)
# persist the sparse matrix so later runs can skip re-vectorizing.
# NOTE: `from sklearn.externals import joblib` was deprecated in
# scikit-learn 0.21 and removed in 0.23 — import the standalone
# joblib package instead.
import joblib
# dump to pickle
joblib.dump(X, 'x_vector.pkl')
# load from pickle
X = joblib.load('x_vector.pkl')
X.shape
# train the final model with k = 6 on the full tf-idf matrix
true_k = 6
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=1000, n_init=1)
model.fit(X)
# (the original cell used the IPython magic `%time`, which is a syntax
# error in plain Python — removed)
# persist the fitted model, then reload it. Without the dump, the
# freshly fitted model above was silently discarded in favor of
# whatever stale model.pkl happened to be on disk.
joblib.dump(model, 'model.pkl')
model = joblib.load('model.pkl')
print("Top terms per cluster:")
# term indices sorted by descending centroid weight, one row per cluster
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 —
# switch to get_feature_names_out() when upgrading.
terms = vectorizer.get_feature_names()
# clusters[i] collects the top 25 terms of cluster i for later inspection
# (defaultdict is already imported at the top of the file)
clusters = defaultdict(list)
for i in range(true_k):
    print("===========")
    print("Cluster %d:" % i)
    print("===========")
    # print the 10 heaviest terms; store the top 25
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])
    for ind in order_centroids[i, :25]:
        clusters[i].append(terms[ind])
The top 10 keywords from each cluster reveal certain features along the price, quality, and value dimensions for each product:
Cluster one: Apple's iPhone 8 Plus — new charging, wireless, iOS, and Apple Watch
Cluster two: Twitter chatter about Samsung and iPhone
Cluster three: the iPhone Plus camera and comparisons to the Google Pixel
Cluster four: Galaxy S8 — Android, Google, screen, and comparisons to LG
Cluster five: Galaxy S8 — Bixby and display
Cluster six: Apple's iPhone — new charging, camera, wireless, and comparisons to the iPhone 7s.
# sanity-check the label count, then attach cluster labels to the rows
model.labels_.shape
df["clusters"] = model.labels_
# drop metadata columns not needed for the per-cluster analysis
df.drop(columns = ["Post Type", "Media Type", "Author ID", "Author Name","Quoted Post",'Quoted Author Name', 'Quoted Author Handle',
'Total Engagements', 'Post Comments', 'Post Likes', 'Post Shares',
'Product Name'], inplace = True)
a = df.groupby("clusters")
# cluster 0: concatenate all non-null "Negative Objects" strings
cluster1 = a.get_group(0)
no_1 = " ".join(cluster1["Negative Objects"].dropna())
# tokenize, lowercase, keep alphabetic tokens only, drop stop words
# (the original applied the isalpha() filter twice — redundant, removed)
tokens_no1 = word_tokenize(no_1)
tokens_no1 = [token.lower() for token in tokens_no1 if token.isalpha()]
tokens_no1 = [token for token in tokens_no1 if token not in nltk_stopwords]
# back to a single string for the word cloud
no1 = " ".join(tokens_no1)
len(tokens_no1)
def countToken(token_list):
    """Return a mapping from each token to its frequency in token_list.

    Equivalent to collections.Counter(token_list); kept as a named helper
    for readability in this notebook.
    """
    counts = defaultdict(int)
    for token in token_list:
        # defaultdict(int) starts missing keys at 0, so the original's
        # explicit membership check was redundant
        counts[token] += 1
    return counts
# bar chart of the 20 most frequent negative-object tokens for cluster 0
d = countToken(tokens_no1)
key = sorted(d, key=d.get, reverse=True)
plt.figure(figsize=(20, 4))
top_tokens = key[:20]
height = [d[token] for token in top_tokens]
bars = top_tokens
y_pos = np.arange(len(bars))
plt.bar(y_pos, height, color='tab:cyan')
plt.xticks(y_pos, bars)
plt.show()
# rebuild the cleaned string and normalize case for the word cloud
no1 = " ".join(tokens_no1)
no1 = no1.lower()
# word cloud of cluster 0's negative objects; also saved as a PNG
cloud_maker = WordCloud(
    width=400,
    height=200,
    margin=0,
    background_color='white',
    colormap='Set2',
    max_words=200000,
    scale=10,
)
wordcloud = cloud_maker.generate(no1)
plt.figure(figsize=(15, 30))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.savefig("wordcloud_cluster1.png")
plt.show()
# cluster 0: positive objects — same pipeline as the negative objects above
po_1 = " ".join(cluster1["Positive Objects"].dropna())
tokens_po_1 = word_tokenize(po_1)
# lowercase BEFORE filtering/counting so the frequency counts are
# case-insensitive, consistent with the negative-objects flow (the
# original only lowercased the joined string after counting, so the
# bar chart counted "Camera" and "camera" separately)
tokens_po_1 = [token.lower() for token in tokens_po_1 if token.isalpha()]
tokens_po_1 = [token for token in tokens_po_1 if token not in nltk_stopwords]
po_1 = " ".join(tokens_po_1)
d = countToken(tokens_po_1)
key = sorted(d, key=d.get, reverse=True)
# bar chart of the top-20 positive tokens
plt.figure(figsize=(19, 4))
height = [d[k] for k in key][:20]
bars = key[:20]
y_pos = np.arange(len(bars))
plt.bar(y_pos, height, color='tab:cyan')
plt.xticks(y_pos, bars)
plt.show()
# word cloud of the positive objects; also saved as a PNG
wordcloud = WordCloud(width=400, height=200, margin=0,
                      background_color='white', colormap='Set2',
                      max_words=200000, scale=10).generate(po_1)
plt.figure(figsize=(15, 30))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.savefig("wordcloud_cluster1_po.png")
plt.show()
# cluster 1: gather the negative and positive object strings
cluster2 = a.get_group(1)
no = " ".join([i for i in list(cluster2["Negative Objects"].dropna())])
po = " ".join([i for i in list(cluster2["Positive Objects"].dropna())])
def genToken(corpus):
    """Tokenize a raw string and clean the tokens.

    Lowercases, keeps alphabetic tokens only, and removes English stop
    words (NLTK list). Returns (token_list, cleaned_string).
    """
    tokens = word_tokenize(corpus)
    # lowercase + alphabetic filter in one pass (the original applied
    # the isalpha() filter twice — redundant, removed)
    tokens = [token.lower() for token in tokens if token.isalpha()]
    tokens = [token for token in tokens if token not in nltk_stopwords]
    cleaned = " ".join(tokens)
    return tokens, cleaned
# tokenize/clean the negative and positive corpora for cluster 1
no_token,no_str = genToken(no)
po_token,po_str = genToken(po)
def toBar(tokens):
    """Plot a bar chart of the 20 most frequent tokens in `tokens`."""
    counts = countToken(tokens)
    # rank tokens by descending frequency and slice the top 20 once,
    # instead of building the full height list and then truncating it
    ranked = sorted(counts, key=counts.get, reverse=True)[:20]
    plt.figure(figsize=(19, 4))
    height = [counts[token] for token in ranked]
    y_pos = np.arange(len(ranked))
    plt.bar(y_pos, height, color='tab:cyan')
    plt.xticks(y_pos, ranked)
    plt.show()
# cluster 1: bar charts for negative and positive tokens
toBar(no_token)
toBar(po_token)
def toCloud(cleanStr):
    """Render a word cloud from an already-cleaned text string."""
    cloud = WordCloud(
        width=400,
        height=200,
        margin=0,
        background_color='white',
        colormap='Set2',
        max_words=200000,
        scale=10,
    ).generate(cleanStr)
    plt.figure(figsize=(15, 30))
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
# cluster 1: word clouds for negative and positive objects
toCloud(no_str)
toCloud(po_str)
# cluster 2 (group index 2): same bar-chart + word-cloud analysis
cluster3 = a.get_group(2)
no = " ".join(cluster3["Negative Objects"].dropna())
po = " ".join(cluster3["Positive Objects"].dropna())
no_token, no_str = genToken(no)
po_token, po_str = genToken(po)
toBar(no_token)
toBar(po_token)
toCloud(no_str)
toCloud(po_str)
# cluster 3 (group index 3): same bar-chart + word-cloud analysis.
# NOTE: the original reused the name `cluster3` here, silently
# clobbering the group-2 frame — renamed to cluster4 (copy-paste bug).
cluster4 = a.get_group(3)
no = " ".join(cluster4["Negative Objects"].dropna())
po = " ".join(cluster4["Positive Objects"].dropna())
no_token, no_str = genToken(no)
po_token, po_str = genToken(po)
# top-20 token bar charts
toBar(no_token)
toBar(po_token)
# word clouds
toCloud(no_str)
toCloud(po_str)
# cluster 4 (group index 4): same bar-chart + word-cloud analysis
cluster5 = a.get_group(4)
no = " ".join(cluster5["Negative Objects"].dropna())
po = " ".join(cluster5["Positive Objects"].dropna())
no_token, no_str = genToken(no)
po_token, po_str = genToken(po)
# top-20 token bar charts
toBar(no_token)
toBar(po_token)
# word clouds
toCloud(no_str)
toCloud(po_str)
# cluster 5 (group index 5): same bar-chart + word-cloud analysis
cluster6 = a.get_group(5)
no = " ".join(cluster6["Negative Objects"].dropna())
po = " ".join(cluster6["Positive Objects"].dropna())
no_token, no_str = genToken(no)
po_token, po_str = genToken(po)
# top-20 token bar charts
toBar(no_token)
toBar(po_token)
# word clouds
toCloud(no_str)
toCloud(po_str)